@config(age=10 * 24 * 60 * 60) def tab_page(self, response): for each in response.doc('a[href^="https://www.v2ex.com/go/"]').items(): self.crawl(each.attr.href, callback=self.board_page, validate_cert=False)
帖子详情页(T):
你可以看到结果里面出现了一些reply的东西,对于这些我们是可以不需要的,我们可以去掉。
同时我们还需要让他自己实现自动翻页功能。
代码:
1 2 3 4 5 6 7 8 9
@config(age=10 * 24 * 60 * 60) def board_page(self, response): for each in response.doc('a[href^="https://www.v2ex.com/t/"]').items(): url = each.attr.href if url.find('#reply')>0: url = url[0:url.find('#')] self.crawl(url, callback=self.detail_page, validate_cert=False) for each in response.doc('a.page_normal').items(): self.crawl(each.attr.href, callback=self.board_page, validate_cert=False) #实现自动翻页功能
@config(age=10 * 24 * 60 * 60) def index_page(self, response): for each in response.doc('a[href^="https://www.v2ex.com/?tab="]').items(): self.crawl(each.attr.href, callback=self.tab_page, validate_cert=False)
@config(age=10 * 24 * 60 * 60) def tab_page(self, response): for each in response.doc('a[href^="https://www.v2ex.com/go/"]').items(): self.crawl(each.attr.href, callback=self.board_page, validate_cert=False)
@config(age=10 * 24 * 60 * 60) def board_page(self, response): for each in response.doc('a[href^="https://www.v2ex.com/t/"]').items(): url = each.attr.href if url.find('#reply')>0: url = url[0:url.find('#')] self.crawl(url, callback=self.detail_page, validate_cert=False) for each in response.doc('a.page_normal').items(): self.crawl(each.attr.href, callback=self.board_page, validate_cert=False)